import pandas as pd
import os
from directory_tree import display_tree
from sklearn.metrics import roc_auc_score
from pathlib import Path
import duckdb
import ibis
from ibis import _
from ibis import selectors as s
import re
from importlib import reload
from IPython.display import display, HTML
# reload(tools)
from tools import FeatureDefinition, pretty_display
import altair as alt
from functools import reduce
= 'home-credit-credit-risk-model-stability/parquet_files/train'
DATA_DIR = Path(DATA_DIR)
p
=True
ibis.options.interactive
# load data into directory
=list(p.glob('*.parquet'))#load_all data
data_files= {re.sub('.parquet','',f.name):ibis.read_parquet(f) for f in data_files} ds
Why train_static
? Please this dataset don’t seems to have cardinality problems.
from importlib import reload
from IPython.display import display, HTML
# reload(tools)
from tools import FeatureDefinition, pretty_display
# lookup column definition here if you don't know what
= [
intersting_cols 'bankacctype_710L',
'credtype_322L','disbursementtype_67L',
'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
'twobodfilling_608L','typesuite_864L'
]
pretty_display(
FeatureDefinition().lookup_col(intersting_cols) )
Variable.str.contains("bankacctype_710L") or Variable.str.contains("credtype_322L") or Variable.str.contains("disbursementtype_67L") or Variable.str.contains("inittransactioncode_186L") or Variable.str.contains("lastst_736L") or Variable.str.contains("paytype1st_925L") or Variable.str.contains("paytype_783L") or Variable.str.contains("twobodfilling_608L") or Variable.str.contains("typesuite_864L")
Variable | Description | File | |
---|---|---|---|
0 | bankacctype_710L | Type of applicant's bank account. | [train_static_0_0, train_static_0_1] |
1 | credtype_322L | Type of credit. | [train_static_0_0, train_static_0_1] |
2 | disbursementtype_67L | Type of disbursement. | [train_static_0_0, train_static_0_1] |
3 | inittransactioncode_186L | Transaction type of the initial credit transaction. |
[train_static_0_0, train_static_0_1] |
4 | lastst_736L | Status of the client's previous credit application. |
[train_static_0_0, train_static_0_1] |
5 | paytype1st_925L | Type of first payment of the client. | [train_static_0_0, train_static_0_1] |
6 | paytype_783L | Type of payment. | [train_static_0_0, train_static_0_1] |
7 | twobodfilling_608L | Type of application process. | [train_static_0_0, train_static_0_1] |
8 | typesuite_864L | Persons accompanying the client during the loan application process. |
[train_static_0_0, train_static_0_1] |
Time Sensitivity?
Maybe not to include time-features in as training dataset.
import matplotlib.pyplot as plt
import seaborn as sns
import tools
=(
d'train_base')
ds.get(=_.date_decision.to_timestamp('%Y-%m-%d').date())
.mutate(date_decision=_.WEEK_NUM)
.mutate(dm
.group_by([_.dm, _.target])=_.count())
.aggregate(n
.to_pandas()
)# d
import altair as alt
=alt.Chart(
c0'train_base')
ds.get(=_
.mutate(date_decision
.date_decision'%Y-%m-%d').date()
.to_timestamp('M')
.truncate(
)
.group_by([_.date_decision])=_.count())
.aggregate(n
.to_pandas()
,=['Loan Approval Volumn',
title'There is a sharp drope during covid'
=150, width=670
],height='midnightblue').encode(
).mark_line(color=alt.X('date_decision',title='Decision Date'),
x=alt.Y('n',title='Loan Approved')
y
)=alt.Chart(d,title='Volumn of Approval and Deafult').mark_area().encode(
c1=alt.X('dm',title='Week Number'),
x=alt.Y('n',title='Number of Case'),
y='target:N'
color
)=alt.Chart(d,title='Propotion of Default Overtime').mark_area().encode(
c2=alt.X('dm',title='Week Number'),
x=alt.Y('n',title='Propotion of Case').stack('normalize'),
y='target:N'
color
)& (c1 | c2) c0
reload(tools)
from tools import FeatureDefinition
pretty_display('change')
FeatureDefinition().lookup_dsc('File=="train_static_0_0"')
.query( )
Variable | File | Description | |
---|---|---|---|
415 | equalitydataagreement_891L | train_static_0_0 | Flag indicating sudden changes in client's social- demographic data (e.g. education, family status, housing type). |
417 | equalityempfrom_62L | train_static_0_0 | Flag indicating a sudden change in the client's length of employment. |
Explore Categorical Variables
=FeatureDefinition()
fd= [
intersting_cols 'equalitydataagreement_891L',
'equalityempfrom_62L',
'bankacctype_710L',
'credtype_322L','disbursementtype_67L',
'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
'twobodfilling_608L','typesuite_864L',
'lastrejectreasonclient_4145040M'
]# pretty_display(
# fd.lookup_col(intersting_cols)
# )
=(
d2
ibis.union('train_static_0_0'),
ds.get('train_static_0_1')
ds.get(
)+ ['case_id']))
.select(s.contains(intersting_cols 'train_base'), ['case_id'])
.join(ds.get(
)= []
C for c in intersting_cols:
# display(d2.select(c,'target').head(1))
= fd.lookup_col(c).Description.str.wrap(30).loc[0]
description # print(description)
=(d2
d'target'])
.group_by([c,=_.count())
.aggregate(n
.to_pandas())=alt.Chart(
ct1
d
).mark_bar().encode(=alt.Y(c + ':N', title=description.split('\n')),
y=alt.X('n:Q',title=''),
x='target:N'
color
)=alt.Chart(
ct2=100
d,width
).mark_bar().encode(=alt.Y(c + ':N', title='', axis=None),
y=alt.X('n:Q',title='').stack('normalize'),
x='target:N',
color
)=ct1 | ct2
ct+= [ct]
C reduce(alt.vconcat, C).configure_axisY(
=0,
titleAngle="left",
titleAlign=10,
titleY=-200,
titleX='#404040',
titleColor='lighter'
titleFontWeight
).properties(=[
title"Categorical Variable Appears in Static Dataset",
"Nothing distinct by propotion except 'Status of the client's previous application'"
] )
pretty_display(fd'static_0_0')
.lookup_tbl('Variable.str.endswith("M")')
.query(=True)) .reset_index(drop
Variable | File | Description | |
---|---|---|---|
0 | lastapprcommoditycat_1041M | train_static_0_0 | Commodity category of the last loan applications made by the applicant. |
1 | lastapprcommoditytypec_5251766M | train_static_0_0 | Commodity type of the last application. |
2 | lastcancelreason_561M | train_static_0_0 | Cancellation reason of the last application. |
3 | lastrejectcommoditycat_161M | train_static_0_0 | Category of commodity in the applicant's last rejected application. |
4 | lastrejectcommodtypec_5251769M | train_static_0_0 | Commodity type of the last rejected application. |
5 | lastrejectreason_759M | train_static_0_0 | Reason for rejection on the most recent rejected application. |
6 | lastrejectreasonclient_4145040M | train_static_0_0 | Reason for the client's last loan rejection. |
7 | previouscontdistrict_112M | train_static_0_0 | Contact district of the client's previous approved application. |
=(
d'train_static_0_0')
ds.get('train_base'),'case_id')
.join(ds.get('M'), _.target, _.case_id)
.select(s.endswith('WEEK_NUM')
.drop(# .pivot_longer(
# s.endswith('M')
# )
# .group_by([_.name,_.value])
# .aggregate(n=_.count())
)
(d'target')
.select(c, 'target')])
.group_by([s.contains(c),s.contains(=_.count())
.aggregate(n .to_pandas())
previouscontdistrict_112M | target | n | |
---|---|---|---|
0 | P6_35_77 | 0 | 2506 |
1 | P41_138_103 | 0 | 1766 |
2 | P54_133_26 | 0 | 8831 |
3 | P197_47_166 | 0 | 34040 |
4 | P111_135_181 | 0 | 11957 |
... | ... | ... | ... |
393 | P159_160_144 | 1 | 27 |
394 | P217_60_135 | 1 | 16 |
395 | P78_30_175 | 1 | 26 |
396 | P7_110_89 | 1 | 18 |
397 | P31_42_128 | 1 | 7 |
398 rows × 3 columns
=FeatureDefinition()
fd= [i for i in d.columns if i not in ['target', "case_id"]]
mcols =[]
Cfor c in mcols:
=(d
data'target')
.select(c, 'target')])
.group_by([s.contains(c),s.contains(=_.count())
.aggregate(n
.to_pandas())=fd.lookup_col(c).Description.str.wrap(30).loc[0]
description=(
cl1
alt
.Chart(data)
.mark_bar()
.encode(='n:Q',
x=alt.Y(c + ':N',title=description.split('\n')),
y='target:N'
color
)
)=(
cl2
alt=100)
.Chart(data,width
.mark_bar(
)
.encode(=alt.X('n:Q',title='').stack('normalize'),
x=alt.Y(c + ':N',title='',axis=None),
y='target:N'
color
)
)+=[cl1 | cl2]
C=reduce(alt.vconcat,C)
CH
CH.configure_axisY(=0,
titleAngle="left",
titleAlign=10,
titleY=-250,
titleX='#404040',
titleColor='lighter'
titleFontWeight
).properties(='All Category Variable'
title )
# alt.data_transformers.enable("vegafusion")
# alt.Chart(d.to_pandas()).transform_fold(
# [i for i in d.columns if i not in ['target', "case_id"]]
# ).mark_bar().encode(
# x='count():Q',
# y='value:N',
# column='key:N'
# )
d
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ ┃ lastapprcommoditycat_1041M ┃ lastapprcommoditytypec_5251766M ┃ lastcancelreason_561M ┃ lastrejectcommoditycat_161M ┃ lastrejectcommodtypec_5251769M ┃ lastrejectreason_759M ┃ lastrejectreasonclient_4145040M ┃ previouscontdistrict_112M ┃ target ┃ case_id ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ │ string │ string │ string │ string │ string │ string │ string │ string │ int64 │ int64 │ ├────────────────────────────┼─────────────────────────────────┼───────────────────────┼─────────────────────────────┼────────────────────────────────┼───────────────────────┼─────────────────────────────────┼───────────────────────────┼────────┼─────────┤ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 0 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 2 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 3 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 6 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 7 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 8 │ │ a55475b1 │ a55475b1 │ P73_130_169 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 10 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 11 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 12 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 13 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └────────────────────────────┴─────────────────────────────────┴───────────────────────┴─────────────────────────────┴────────────────────────────────┴───────────────────────┴─────────────────────────────────┴───────────────────────────┴────────┴─────────┘